interactive_figures.ipynb¶

Make neat interactive figures using altair with the Nipah RBP DMS data¶

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
#input configs
altair_config = None
nipah_config = None

#E2 specific files
func_scores_E2_file = None
binding_E2_file = None
#E3 specific files
func_scores_E3_file = None
binding_E3_file = None

#merged_files
merged_df_file = None
concat_df_file = None

#output plots
output_corr = None
entry_binding_corr_plot_E2_output = None
entry_binding_corr_plot_E3_output = None
corr_entry_binding_large_output = None
combined_binding_output = None
entry_by_site_plot_e2_output = None
entry_by_site_plot_e3_output = None
In [2]:
# Parameters
altair_config = "data/custom_analyses_data/interactive_theme.py"
nipah_config = "nipah_config.yaml"
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
output_corr = "results/images/corr_heatmap.html"
entry_binding_corr_plot_E2_output = "results/images/entry_binding_corr_plot_E2.html"
entry_binding_corr_plot_E3_output = "results/images/entry_binding_corr_plot_E3.html"
corr_entry_binding_large_output = "results/images/corr_entry_binding_large.html"
combined_binding_output = "results/images/combined_binding.html"
entry_by_site_plot_e2_output = "results/images/entry_by_site_plot_e2.html"
entry_by_site_plot_e3_output = "results/images/entry_by_site_plot_e3.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")
Setup in correct directory

Setup input file paths¶

In [5]:
if nipah_config is None:
    #input files
    altair_config = 'data/custom_analyses_data/interactive_theme.py'
    nipah_config = 'nipah_config.yaml'
    
    func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    
    func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
    
    antibody_file = 'results/filtered_data/escape/mab_filter_concat.csv'
    merged_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_merged.csv'
    concat_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_concat.csv'
In [6]:
if altair_config:
    with open(altair_config, 'r') as file:
        exec(file.read())

with open(nipah_config) as f:
    config = yaml.safe_load(f)

Import filtered data¶

In [7]:
merged_df = pd.read_csv(merged_df_file) #merged entry scores 
#ab_df = pd.read_csv(antibody_file)
concat_df = pd.read_csv(concat_df_file)
In [8]:
# Read filtered cell entry data
def read_func_data(file,name):
    effect_df = pd.read_csv(file)
    effect_df = effect_df[['site','wildtype','mutant','effect']]
    effect_df['cell_type'] = name
    return effect_df

# Call func to read in cell entry data
e2_func_df = read_func_data(func_scores_E2_file, 'CHO-EFNB2')
e3_func_df = read_func_data(func_scores_E3_file, 'CHO-EFNB3')

# Read filtered binding data
def read_binding_data(file,name):
    binding_df = pd.read_csv(file)
    binding_df = binding_df[['site','wildtype','mutant','binding_mean']]
    binding_df['cell_type'] = name
    return binding_df

# Call func to read in binding data
e2_bind_df = read_binding_data(binding_E2_file,'CHO-EFNB2')
e3_bind_df = read_binding_data(binding_E3_file,'CHO-EFNB3')

# Concat binding and func data, then merge
def concat_dfs(bind1,bind2,entry1,entry2):
    combo_bind_df = pd.concat([bind1,bind2])
    combo_entry_df = pd.concat([entry1,entry2])
    total_merged = pd.merge(combo_bind_df,combo_entry_df,on=['site','wildtype','mutant','cell_type'],how='outer')
    return total_merged

final_merged_df = concat_dfs(e2_bind_df,e3_bind_df,e2_func_df,e3_func_df)
In [9]:
### Ok, now I have different inputs ready to go for plotting. Lets review
# I have my different entry dataframes
display(e2_func_df.head(2))
display(e3_func_df.head(2))
display(concat_df.head(2))
display(merged_df.head(2))
site wildtype mutant effect cell_type
0 71 Q C -1.750 CHO-EFNB2
1 71 Q D -1.164 CHO-EFNB2
site wildtype mutant effect cell_type
0 71 Q C -0.7227 CHO-EFNB3
1 71 Q D -0.3884 CHO-EFNB3
site wildtype mutant effect effect_std times_seen n_selections cell_type wildtype_site wt_type mutant_type
0 71 Q C -1.750 0.1777 4.625 8 CHO-bEFNB2 Q71 hydrophilic special
1 71 Q D -1.164 0.8890 4.500 8 CHO-bEFNB2 Q71 hydrophilic negative
site wildtype mutant effect_E2 effect_std_E2 times_seen_E2 n_selections_E2 cell_type_E2 wildtype_site_E2 wt_type_E2 mutant_type_E2 effect_E3 effect_std_E3 times_seen_E3 n_selections_E3 cell_type_E3 wildtype_site_E3 wt_type_E3 mutant_type_E3
0 71 Q C -1.750 0.1777 4.625 8.0 CHO-bEFNB2 Q71 hydrophilic special -0.7227 0.7828 3.000 7.0 CHO-bEFNB3 Q71 hydrophilic special
1 71 Q D -1.164 0.8890 4.500 8.0 CHO-bEFNB2 Q71 hydrophilic negative -0.3884 0.6369 3.429 7.0 CHO-bEFNB3 Q71 hydrophilic negative

Make heatmap of correlations between entry in CHO-bEFNB2 and CHO-bEFNB3¶

In [10]:
def correlation_heatmap(df):
    chart = (
        alt.Chart(df,title=alt.Title('Effects of RBP mutations on entry',subtitle='Between CHO cells expressing bat EFNB2 or EFNB3'))
        .mark_rect()
        .encode(
            alt.X("effect_E2", title="Entry in CHO-bEFNB2",axis=alt.Axis(values=[-4,-3,-2,-1,0,1])).bin(maxbins=75),
            alt.Y("effect_E3", title="Entry in CHO-bEFNB3",axis=alt.Axis(values=[-4,-3,-2,-1,0,1])).bin(maxbins=75),
            alt.Color('count():Q',title='Count').scale(type='log'),
            tooltip=['count()'],
        )
    ).properties(
    height=400,
    width=400,
).configure_legend(
    padding=2,
    orient='top-left', #"left", "right", "top", "bottom", "top-left", "top-right", "bottom-left", "bottom-right", "none"
    labelFontSize=16,
    titlePadding=2,
    symbolSize=100,
)

    return chart
corr_heatmap = correlation_heatmap(merged_df)
corr_heatmap.display()
if entry_by_site_plot_e3_output is not None:
    corr_heatmap.save(output_corr)

Make interactive plot linking individual binding and entry effects with top 10 summed binding and entry¶

In [11]:
def plot_entry_binding_interactive(df,name):
    #find contact sites
    df_copy = df.copy()
    df_copy.loc[:, 'is_contact'] = df_copy['site'].isin(config['contact_sites'])
    
    # Initialize a selection brush for interactive filtering.
    brush = alt.selection_interval() 
    #scatter plot
    chart = alt.Chart(
        df_copy
    ).mark_point(
        filled=True,
        size=50
    ).encode(
            alt.X("effect", title="Cell Entry", axis=alt.Axis(values=[-2,-1,0,1])),
            alt.Y("binding_mean", title="Binding", axis=alt.Axis(values=[-4,-2,0,2])),
            color=alt.condition(brush, 'is_contact', alt.value('lightgray')),  # Conditional color encoding based on selection.
            tooltip=["site", "wildtype", "mutant", "binding_mean","effect"]  # Define tooltip content for additional data on hover.
    ).add_params(
        brush
    ).properties(
        width=400, 
        height=400
    )  
    
    # Create a bar chart showing the sum of binding_median values for the top 10 sites filtered by the selection.
    bars_binding = alt.Chart(df_copy).transform_filter(
        brush  # Apply the selection filter to include only selected data.
    ).transform_aggregate(
        binding_aggr='sum(binding_mean)',  # Aggregate data by summing up binding_median.
        groupby=['site', 'is_contact']
    ).transform_window(
        rank='rank(binding_aggr)',  # Rank sites based on the aggregated sum.
        sort=[alt.SortField('binding_aggr', order='descending')]  # Sort by descending order of sum.
    ).transform_filter(
        alt.datum.rank <= 10  # Filter to keep only the top 10 ranked sites.
    ).mark_bar().encode(
        x=alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),  # Encode site names on x-axis with custom sorting and label angle.
        y=alt.Y('binding_aggr:Q', title='Binding'),  # Encode aggregated sum on y-axis.
        color=alt.Color('is_contact', title='Receptor Contact Site')  # Color bars based on whether they are contact sites.
    ).properties(width=200, height=50)  # Set size of the bar chart.
    
    # Similar to the bars chart for binding_median, but aggregates and ranks sites based on the 'effect' value.
    bars_effect = alt.Chart(df_copy,title='Top 10').transform_filter(
        brush
    ).transform_aggregate(
        effect_aggr='sum(effect)',
        groupby=['site', 'is_contact']
    ).transform_window(
        rank='rank(effect_aggr)',
        sort=[alt.SortField('effect_aggr', order='descending')]
    ).transform_filter(
        alt.datum.rank <= 10
    ).mark_bar().encode(
        x=alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),
        y=alt.Y('effect_aggr:Q', title='Entry'),
        color=alt.Color('is_contact', title='Receptor Contact Site')
    ).properties(width=200, height=50)
    
    # Combine the scatter plot with the two bar charts (stacked vertically and placed side by side).
    combined_chart = chart & (bars_effect | bars_binding)
    combined_chart = combined_chart.properties(
        title={
            "text": f"Cell Entry and Binding Analysis for {name}", 
            "subtitle": ["Draw box in scatterplot to show the top 10 sites by",
                         "summed binding and cell entry"],
            "color": "black",
            "subtitleColor": "gray"
        }
    )
    return combined_chart

entry_binding_corr_plot_E2 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-EFNB2"'),'bEFNB2')
entry_binding_corr_plot_E2.display()
if entry_by_site_plot_e3_output is not None:
    entry_binding_corr_plot_E2.save(entry_binding_corr_plot_E2_output)

Now do the same above for for EFNB3¶

In [12]:
entry_binding_corr_plot_E3 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-EFNB3"'),'bEFNB3')
entry_binding_corr_plot_E3.display()
if entry_by_site_plot_e3_output is not None:
    entry_binding_corr_plot_E3.save(entry_binding_corr_plot_E3_output)

Make plot based on region¶

In [13]:
def find_domain(df):
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        #"Receptor Contact": config["contact_sites"],

        "Head": list(range(178, 602)),
        #"Total": list(range(71, 602)),
    }
    agg_means = []
    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = df[df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {
                    "region": barrel,
                    "binding_mean": row["binding_mean"],
                    "effect": row['effect'],
                    "site": row["site"],
                    "mutant": row["mutant"],
                    "wildtype": row["wildtype"],
                    "cell_type": row["cell_type"],
                }
            )
        agg_means_df = pd.DataFrame(agg_means)
    return agg_means_df
binding_entry_by_domain_df = find_domain(final_merged_df)
display(binding_entry_by_domain_df)


# Step 2: Use pivot_table to reshape the DataFrame
df_pivot = binding_entry_by_domain_df.pivot_table(index=['region', 'site', 'wildtype','mutant'], 
                          columns='cell_type', 
                          values=['effect', 'binding_mean'],
                          aggfunc='first').reset_index()

# Step 3: Flatten the hierarchical column index
df_pivot.columns = ['_'.join(col).strip() if col[1] else col[0] for col in df_pivot.columns.values]
#display(df_pivot)
# Optionally, rename the columns to your desired format
df_pivot.rename(columns={
    'effect_CHO-EFNB2': 'effect_E2',
    'effect_CHO-EFNB3': 'effect_E3',
    'binding_mean_CHO-EFNB2': 'binding_E2',
    'binding_mean_CHO-EFNB3': 'binding_E3'
}, inplace=True)
display(df_pivot)
region binding_mean effect site mutant wildtype cell_type
0 Stalk -0.78170 -1.16400 71 D Q CHO-EFNB2
1 Stalk 0.16590 -1.25500 71 E Q CHO-EFNB2
2 Stalk -0.34290 -1.05800 71 F Q CHO-EFNB2
3 Stalk 0.46570 -1.42500 71 G Q CHO-EFNB2
4 Stalk 0.02003 -0.37640 71 H Q CHO-EFNB2
... ... ... ... ... ... ... ...
19456 Head NaN -1.66700 601 F C CHO-EFNB3
19457 Head NaN -2.04700 601 G C CHO-EFNB3
19458 Head NaN -0.75770 601 I C CHO-EFNB3
19459 Head NaN -1.52300 601 P C CHO-EFNB3
19460 Head NaN 0.01403 601 V C CHO-EFNB3

19461 rows × 7 columns

region site wildtype mutant binding_E2 binding_E3 effect_E2 effect_E3
0 Head 178 V A 0.7066 0.008861 -0.2181 0.01306
1 Head 178 V C 0.1814 0.451400 0.1203 0.47640
2 Head 178 V D NaN -0.041930 -1.9200 -1.03800
3 Head 178 V E NaN 0.142800 -1.7900 -0.41900
4 Head 178 V F 0.5869 0.039550 -0.7901 -0.34260
... ... ... ... ... ... ... ... ...
9915 Stalk 147 K S 0.1344 -0.060950 0.1857 0.13650
9916 Stalk 147 K T 1.0700 -0.052750 -0.3402 -0.79560
9917 Stalk 147 K V NaN 0.086850 -1.9730 -1.02500
9918 Stalk 147 K W NaN NaN -2.9010 -2.27500
9919 Stalk 147 K Y NaN 0.146400 -2.9410 -1.39500

9920 rows × 8 columns

In [14]:
def correlation_plot(df):
    options = ['Stalk', 'Neck', 'Linker','Head']
    labels = [option + ' ' for option in options]
    
    input_dropdown = alt.binding_radio(
        # Add the empty selection which shows all when clicked
        options=options + [None],
        labels=labels + ['All'],
        name='Region: '
    )
    selection = alt.selection_point(
        fields=['region'],
        bind=input_dropdown,
    )    
    color = alt.condition(
    selection,
    alt.Color('region:N',scale=alt.Scale(domain=options)),
    alt.value('lightgray'),
    #sort=options,
    )

    opacity = alt.condition(
    selection,
    alt.value(1),
    alt.value(0.5)
    )
    effect_chart = (
        alt.Chart(df,title='Cell entry')
        .mark_point(size=30,opacity=1,filled=True)
        .encode(
            alt.X("effect_E2", title="Entry in CHO-bEFNB2",axis=alt.Axis(tickCount=4)),
            alt.Y("effect_E3", title="Entry in CHO-bEFNB3",axis=alt.Axis(tickCount=4)),
            tooltip=["wildtype","site", "mutant"],
            opacity=opacity,
            color=color,
        )
    ).properties(height=400,width=400)
    
    binding_chart = (
        alt.Chart(df,title='Receptor binding')
        .mark_point(size=30,opacity=1,filled=True)
        .encode(
            alt.X("binding_E2", title="bEFNB2 Binding",axis=alt.Axis(tickCount=4)),
            alt.Y("binding_E3", title="bEFNB3 Binding",axis=alt.Axis(tickCount=4)),
            tooltip=["site", "mutant"],
            color=color,
            opacity=opacity
        )
    ).properties(height=400,width=400)
    combined_chart = effect_chart | binding_chart
    combined_chart=combined_chart.add_params(selection) #.properties(title=alt.Title('Entry and Binding Correlations',subtitle='Select radio button to see mutants highlighted'))
    return combined_chart


corr_entry_binding_large = correlation_plot(df_pivot)
corr_entry_binding_large.display()
if entry_by_site_plot_e3_output is not None:
    corr_entry_binding_large.save(corr_entry_binding_large_output)
In [15]:
def make_custom_figure(df,name):
    brush = alt.selection_interval() #define selection brush
    custom_order = ["Stalk", "Neck", "Linker", "Head"]

    chart = (
        alt.Chart(
            df,
            title=alt.Title(
                f'{name}',
                #subtitle='Select points to see the top 10 summed sites'
            )
        )
        .mark_point(
            opacity=0.3, 
            filled=True
        )
        .encode(
            alt.X(
                "binding_mean",
                title=f"Binding",
                axis=alt.Axis(tickCount=4),
            ),
            alt.Y(
                "region:O",
                sort=custom_order,
                title="RBP Region",
            ),
            
            yOffset="random:Q",
            tooltip=["region", "binding_mean", "site", "mutant"],
            color=alt.condition(brush, 'region', alt.value('lightgray')),
        )
        .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
    ).add_params(brush).properties(height=200,width=400)
    
    bars = alt.Chart(df).transform_filter(
            brush
        ).transform_aggregate(
            binding_aggr='sum(binding_mean)',
            groupby=['site', 'region']
        ).transform_window(
            rank='rank(binding_aggr)',
            sort=[alt.SortField('binding_aggr', order='descending')]
        ).transform_filter(
            alt.datum.rank <= 10
        ).mark_bar().encode(
            y=alt.Y('binding_aggr:Q',title='Binding'),
            x=alt.X('site:N', sort='-y',title='Site'),
            color=alt.Color('region',title='Region')
        ).properties(height=50,width=400)
    combined_chart = chart & bars 
    combined_chart
    return combined_chart
In [16]:
efnb2_binding_region = make_custom_figure(binding_entry_by_domain_df.query('cell_type == "CHO-EFNB2"'),'bEFNB2')
efnb2_binding_region.display()
#efnb2_binding_region.save('results/images/efnb2_binding_region.html')
In [17]:
efnb3_binding_region = make_custom_figure(binding_entry_by_domain_df.query('cell_type == "CHO-EFNB3"'),'bEFNB3')
efnb3_binding_region.display()
#efnb3_binding_region.save('results/images/efnb3_binding_region.html')
In [18]:
combined_binding = (efnb2_binding_region | efnb3_binding_region).properties(title=alt.Title('Receptor binding by RBP mutant',subtitle='Draw boxes around scatter plots to see top sites'))
combined_binding.display()
if entry_by_site_plot_e3_output is not None:
    combined_binding.save(combined_binding_output)
In [19]:
def entry_by_site(df):
    tmp_df = df.groupby(['site','cell_type'])['effect'].mean().reset_index()
    # define ranges of different RBP regions
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 602)),
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend

    # For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
    agg_means = [] #store aggregation 
    for barrel, sites in barrel_ranges.items():
        subset = tmp_df[tmp_df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {"region": barrel, 
                 "effect": row["effect"], 
                 "site": row["site"],
                 "cell_type": row["cell_type"],
                }
            )
        agg_means_df = pd.DataFrame(agg_means).round(3)
    agg_means_df['beta_sheet'] = agg_means_df['site'].isin(config['beta_sheet']) #add a column specifying which sites are in beta sheets
    ### The main chart plotting

    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )
    chart = (
        alt.Chart(
            agg_means_df,        
        )
        .mark_bar(opacity=1,stroke='black')
        .encode(
            alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y("effect", title="Mean entry"),
            tooltip=["site", "effect","region"],
            color=alt.Color('region',sort=custom_order,title='Region'),
            strokeWidth=alt.condition(
                    variant_selector, alt.value(1), alt.value(0)
                ),
            row=alt.Row('cell_type',title=None,header=alt.Header(labelFontSize=15,labelFontWeight='bold'))
        )
    ).properties(width=800,height=150)
    
    ### Draw rectanges showing where beta sheets are in protein above chart
    rect = alt.Chart(agg_means_df).mark_rect(color='gray').encode(
        alt.X('site:N',axis=None),
        alt.Y('beta_sheet',axis=None),
        tooltip=['site','beta_sheet']
    ).transform_filter('datum.beta_sheet == true').properties(width=800,height=10)

    #select_bar = alt.Chart(agg_means_df).mark_rect(color='gray').encode(
        
    
    #combined_chart = alt.layer(chart,rect)
    combined_chart = alt.vconcat(rect,chart,padding=0).resolve_scale(y='independent',x='shared')
    combined_chart = combined_chart.properties(title=alt.Title('Cell entry by RBP mutations',
                                                               subtitle=['Hover mouse over bars to view information about cell entry','Gray bars are beta sheets'])).add_params(variant_selector)
    return combined_chart

entry_by_site_plot = entry_by_site(concat_df)
entry_by_site_plot.display()
#entry_by_site_plot.save('results/images/entry_by_site_plot.html')

Make interactive chart for individual mutations¶

In [20]:
def entry_by_site(df,name,effect):
    amino_acid_order = ["R","K","H","D","E","Q","N","S","T","Y","W","F","A","I","L","M","V","G","P","C"]
    # make an empty dataframe with every possible mutation so empty values still get plotted
    sites = range(71, 603)
    data = [{"site": site, "mutant": aa} for site in sites for aa in amino_acid_order]
    empty_df = pd.DataFrame(data)
    full_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
 
    #setup interactive features
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )
    
    #make base chart
    base = alt.Chart(full_df).add_params(variant_selector)
    #add bar chart of cell entry by site
    chart = base.mark_bar(opacity=1,stroke='black').encode(
            alt.X("site:N", title='Site',axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y(f"mean({effect})", title="Mean entry"),
            tooltip=["site", "wildtype","region"],
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.7)),
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            color=alt.Color('region',title='Region')
    ).properties(width=800,height=200)

    #add heatmap below showing effect of mutants on cell entry
    select_bar = alt.Chart(full_df).mark_bar(stroke='black').encode(
        alt.X('mutant:N',title=None,scale=alt.Scale(domain=amino_acid_order)),
        color=alt.Color(f'{effect}',legend=alt.Legend(orient='right',direction='horizontal',titleAlign='center',titleAnchor='middle'),title='Cell entry',scale=alt.Scale(scheme='redblue',domainMid=0,domain=[-4,2]))
    ).transform_filter(
        variant_selector
    ).properties(width=400,height=10)

    #make heatmap have an x for wildtype residue
    select_bar_wildtype = alt.Chart(full_df).mark_text(color="black", text="X", size=10, align="center", baseline="middle").encode(
        alt.X('wildtype:N',title='Amino acid'),
    ).transform_filter(
        variant_selector
    ).transform_filter(
        (alt.datum[effect] != None) #filter out empty data
    ).properties(width=400,height=10)
    combined_effects_w_wildtype = alt.layer(select_bar_wildtype,select_bar).resolve_scale(x='shared')

    # combine the charts
    combined_chart = alt.vconcat(chart,combined_effects_w_wildtype).resolve_scale(y='independent',x='independent')
    combined_chart = combined_chart = combined_chart.properties(title=alt.Title(f'Entry in {name}',
                                                               subtitle=['Hover over bars to view information about specific mutations']))
    return combined_chart
In [21]:
# call chart function
entry_by_site_plot_e2 = entry_by_site(df_pivot,'CHO-bEFNB2','effect_E2')
entry_by_site_plot_e2.display()
if entry_by_site_plot_e3_output is not None:
    entry_by_site_plot_e2.save(entry_by_site_plot_e2_output)
In [22]:
# call chart function
entry_by_site_plot_e3 = entry_by_site(df_pivot,'CHO-bEFNB3','effect_E3')
entry_by_site_plot_e3.display()
if entry_by_site_plot_e3_output is not None:
    entry_by_site_plot_e3.save(entry_by_site_plot_e3_output)
In [23]:
combined_entry_by_site = (entry_by_site_plot_e2 & entry_by_site_plot_e3)
combined_entry_by_site.display()

Testing stuff below¶

In [24]:
def make_effect_by_site_with_hover_tooltip(df):
    tmp_df = df.groupby(['cell_type','site'])['effect'].mean().reset_index().round(2)
    
    # Create a selection that chooses the nearest point & selects based on x-value
    nearest = alt.selection_point(nearest=True, on='mouseover',
                            fields=['site'], empty=False)
    
    # The basic line
    line = alt.Chart(tmp_df).mark_line(interpolate='basis',size=1).encode(
        alt.X('site:Q', title='Site',axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600])),
        alt.Y('effect:Q',title='Mean entry'),
        color=alt.Color('cell_type:N',title='Cell type')
    )
    
    # Transparent selectors across the chart. This is what tells us
    # the x-value of the cursor
    selectors = alt.Chart(tmp_df).mark_point().encode(
        alt.X('site:Q'),
        opacity=alt.value(0),
    ).add_params(
        nearest
    )
    # Draw points on the line, and highlight based on selection
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )
    
    # Draw text labels near the points, and highlight based on selection
    text = line.mark_text(align='left', dx=5, dy=-5,fontSize=15).encode(
        text=alt.condition(nearest, 'effect:Q', alt.value(' ')),
        #color=alt.value('black')
    )#.transform_filter(alt.datum.cell_type == 'CHO-EFNB2')
    
    # Draw a rule at the location of the selection
    rules = alt.Chart(tmp_df).mark_rule(color='gray').encode(
        x='site:Q',
    ).transform_filter(
        nearest
    )
    # Put the five layers into a chart and bind the data
    combined_chart = alt.layer(
        line, selectors, points, rules, text
    ).properties(
        width=800, height=200
    )
    return combined_chart

alt_plot = make_effect_by_site_with_hover_tooltip(binding_entry_by_domain_df)
alt_plot.display()
In [ ]:
 
In [25]:
display(df_pivot)
region site wildtype mutant binding_E2 binding_E3 effect_E2 effect_E3
0 Head 178 V A 0.7066 0.008861 -0.2181 0.01306
1 Head 178 V C 0.1814 0.451400 0.1203 0.47640
2 Head 178 V D NaN -0.041930 -1.9200 -1.03800
3 Head 178 V E NaN 0.142800 -1.7900 -0.41900
4 Head 178 V F 0.5869 0.039550 -0.7901 -0.34260
... ... ... ... ... ... ... ... ...
9915 Stalk 147 K S 0.1344 -0.060950 0.1857 0.13650
9916 Stalk 147 K T 1.0700 -0.052750 -0.3402 -0.79560
9917 Stalk 147 K V NaN 0.086850 -1.9730 -1.02500
9918 Stalk 147 K W NaN NaN -2.9010 -2.27500
9919 Stalk 147 K Y NaN 0.146400 -2.9410 -1.39500

9920 rows × 8 columns

In [26]:
aggregated_df = df_pivot.groupby(['site']).agg({
        #'binding_E2': 'mean',
        #'binding_E3': 'mean',
        'effect_E2': 'mean',
        'effect_E3': 'mean',
        'region': 'first',
        'wildtype': 'first',
        'region': 'first'

    }).reset_index().round(2)

melted_df = pd.melt(aggregated_df, id_vars=['site','wildtype'], value_vars=['effect_E2','effect_E3','region'], var_name='selection', value_name='effect')
display(melted_df)
site wildtype selection effect
0 71 Q effect_E2 -1.18
1 72 N effect_E2 -1.23
2 73 Y effect_E2 -0.74
3 74 T effect_E2 -0.68
4 75 R effect_E2 -0.73
... ... ... ... ...
1588 597 I region Head
1589 598 P region Head
1590 599 E region Head
1591 600 Q region Head
1592 601 C region Head

1593 rows × 4 columns

In [27]:
full_ranges = [
    list(range(start, end))
    for start, end in [(71, 136), (136, 201), (201, 266), (266, 331), (331, 396), (396, 461), (461, 526), (526, 591), (591, 602)]
        #(71, 181),
        #(181, 291),
        #(291, 401),
        #(401, 511),
        #(511, 603),
]
print(full_ranges)
[[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135], [136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200], [201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265], [266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330], [331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395], [396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460], [461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525], [526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590], [591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601]]
In [28]:
df = pd.DataFrame(
        {
            "site": config["contact_sites"],
            "contact": [0.0] * len(config["contact_sites"]),
        }
    )
df["selection"] = "receptor contact"
df.rename(columns={"contact": "effect"}, inplace=True)
melted_df = pd.concat([df,melted_df])
display(melted_df)
site effect selection wildtype
0 239 0.0 receptor contact NaN
1 240 0.0 receptor contact NaN
2 241 0.0 receptor contact NaN
3 242 0.0 receptor contact NaN
4 305 0.0 receptor contact NaN
... ... ... ... ...
1588 597 Head region I
1589 598 Head region P
1590 599 Head region E
1591 600 Head region Q
1592 601 Head region C

1626 rows × 4 columns

In [29]:
empty_chart = []
to_sort = ['receptor_contact','effect_E2','effect_E3']

for idx,site_subset in enumerate(full_ranges):
    tmp_df = melted_df[melted_df['site'].isin(site_subset)]
    display(tmp_df)


    
    is_last_plot = idx == len(full_ranges) - 1

    x_axis = alt.Axis(
        labelAngle=-90,
        labelExpr="datum.value % 10 === 0 ? datum.value : ''",
        title="Site" if is_last_plot else None,
        labels=True,
    )
    
    effect_legend = (
        alt.Legend(
            #title=legend_title,
            direction="horizontal",
            gradientLength=150,
            titleAnchor="middle",
            tickCount=3,
            labelAlign="center",
        )
        if is_last_plot
        else None
    )
    
    
    base = alt.Chart(tmp_df).encode(
        alt.X("site:O", title="Site",axis=x_axis),
        alt.Y('selection:N',title=None,sort=to_sort),
        tooltip=['site','selection','effect']
    ).properties(width=alt.Step(10),height=alt.Step(10))

    entry = base.mark_rect(stroke='black',strokeWidth=0.5).encode(
        alt.Color('effect',legend=effect_legend)
            .scale(scheme='redblue',domainMid=0,domain=[-4,2])
    ).transform_filter(
        (alt.datum.selection == "effect_E2") | (alt.datum.selection == "effect_E3")
    )
    
    region = base.mark_rect(stroke='black',strokeWidth=0.5).encode(
        alt.Color('effect',legend=effect_legend)
    ).transform_filter(alt.datum.selection == 'region')

    contact = base.mark_rect(color='black',stroke='black',strokeWidth=0.5).encode(
    ).transform_filter(
        (alt.datum.selection == "receptor contact")
    )
    
    tmp_chart = alt.layer(region, entry,contact)
    empty_chart.append(tmp_chart)

test = alt.vconcat(*empty_chart,spacing=0)
test
site effect selection wildtype
0 71 -1.18 effect_E2 Q
1 72 -1.23 effect_E2 N
2 73 -0.74 effect_E2 Y
3 74 -0.68 effect_E2 T
4 75 -0.73 effect_E2 R
... ... ... ... ...
1122 131 Stalk region I
1123 132 Stalk region S
1124 133 Stalk region Q
1125 134 Stalk region S
1126 135 Stalk region T

195 rows × 4 columns

site effect selection wildtype
65 136 -0.6 effect_E2 A
66 137 -0.41 effect_E2 S
67 138 -2.86 effect_E2 I
68 139 -0.96 effect_E2 N
69 140 0.07 effect_E2 E
... ... ... ... ...
1187 196 Head region Q
1188 197 Head region I
1189 198 Head region L
1190 199 Head region K
1191 200 Head region P

195 rows × 4 columns

site effect selection wildtype
0 239 0.0 receptor contact NaN
1 240 0.0 receptor contact NaN
2 241 0.0 receptor contact NaN
3 242 0.0 receptor contact NaN
130 201 -1.32 effect_E2 K
... ... ... ... ...
1252 261 Head region E
1253 262 Head region V
1254 263 Head region P
1255 264 Head region S
1256 265 Head region L

199 rows × 4 columns

site effect selection wildtype
4 305 0.0 receptor contact NaN
195 266 -2.89 effect_E2 F
196 267 -1.96 effect_E2 M
197 268 -0.9 effect_E2 T
198 269 -0.99 effect_E2 N
... ... ... ... ...
1317 326 Head region N
1318 327 Head region G
1319 328 Head region G
1320 329 Head region G
1321 330 Head region Y

196 rows × 4 columns

site effect selection wildtype
5 388 0.0 receptor contact NaN
6 389 0.0 receptor contact NaN
260 331 -1.13 effect_E2 N
261 332 -0.08 effect_E2 Q
262 333 -0.18 effect_E2 H
... ... ... ... ...
1382 391 Head region K
1383 392 Head region P
1384 393 Head region E
1385 394 Head region N
1386 395 Head region C

197 rows × 4 columns

site effect selection wildtype
7 401 0.0 receptor contact NaN
8 402 0.0 receptor contact NaN
9 458 0.0 receptor contact NaN
325 396 -0.56 effect_E2 R
326 397 -0.31 effect_E2 L
... ... ... ... ...
1447 456 Head region A
1448 457 Head region S
1449 458 Head region F
1450 459 Head region S
1451 460 Head region W

198 rows × 4 columns

site effect selection wildtype
10 488 0.0 receptor contact NaN
11 489 0.0 receptor contact NaN
12 490 0.0 receptor contact NaN
13 491 0.0 receptor contact NaN
14 492 0.0 receptor contact NaN
... ... ... ... ...
1512 521 Head region S
1513 522 Head region A
1514 523 Head region G
1515 524 Head region V
1516 525 Head region F

205 rows × 4 columns

site effect selection wildtype
20 530 0.0 receptor contact NaN
21 531 0.0 receptor contact NaN
22 532 0.0 receptor contact NaN
23 533 0.0 receptor contact NaN
24 555 0.0 receptor contact NaN
... ... ... ... ...
1577 586 Head region N
1578 587 Head region V
1579 588 Head region I
1580 589 Head region R
1581 590 Head region P

208 rows × 4 columns

site effect selection wildtype
520 591 -0.49 effect_E2 K
521 592 -2.07 effect_E2 L
522 593 -1.86 effect_E2 F
523 594 -2.54 effect_E2 A
524 595 -1.22 effect_E2 V
525 596 -0.0 effect_E2 K
526 597 -2.19 effect_E2 I
527 598 -1.04 effect_E2 P
528 599 0.01 effect_E2 E
529 600 0.07 effect_E2 Q
530 601 -1.06 effect_E2 C
1051 591 -1.46 effect_E3 K
1052 592 -1.72 effect_E3 L
1053 593 -1.93 effect_E3 F
1054 594 -2.25 effect_E3 A
1055 595 -0.53 effect_E3 V
1056 596 -0.0 effect_E3 K
1057 597 -2.06 effect_E3 I
1058 598 -0.82 effect_E3 P
1059 599 0.12 effect_E3 E
1060 600 0.18 effect_E3 Q
1061 601 -0.77 effect_E3 C
1582 591 Head region K
1583 592 Head region L
1584 593 Head region F
1585 594 Head region A
1586 595 Head region V
1587 596 Head region K
1588 597 Head region I
1589 598 Head region P
1590 599 Head region E
1591 600 Head region Q
1592 601 Head region C
Out[29]:
In [ ]: